Libraries
Load data
I am using the AHA dataset for writing my code because it is the
simplest dataset I can work with. Simplest = significant number of
baseline samples without issue of repeated measures, balanced
distribution of classes, likely difference to be detected present.
p_ps = readRDS("/Users/aa370/Library/CloudStorage/Box-Box/project_davidlab/LAD_LAB_Personnel/Ammara_A/Projects/POMMS/R24_POMMS/data/processed/20221104_pomms_metadata_plant.rds")
p_ps
sample_data(p_ps)
Preprocess
Remove problematic samples
Taking out samples with less than or equal to 0 reads and keeping
only entry timepoint reduces samples from 384 down to 240.
p_ps = p_ps %>%
subset_samples( reads > 0) %>%
subset_samples(timepoint == "Entry")
p_ps
Class distribution in this dataset: Case = 193 Control = 47
sample_data(p_ps) %>%
as.data.frame() %>%
as_tibble() %>%
group_by(group) %>%
summarise(count = n())
Feature table
Attach data labels
#Subsampling: SMOTE
Zero variance features
No features have zero variance
nzv = subsampled %>%
select(!group) %>%
nearZeroVar(saveMetrics = TRUE)
rm(nzv)
Linear dependencies
feature_ld = filtered_subsampled %>%
select(!group) %>%
findLinearCombos()
filtered_subsampled = filtered_subsampled[, - feature_ld$remove]
Modeling
Load functions from classification_function_AA file
results
[[1]]
[[2]]
NA
rm(label_list)
Plotting
AUC plot and t-test
auc_df %>%
pivot_longer(cols = c("group", "group_shuffle"), names_to = "feature", values_to = "AUC") %>%
ggplot()+
geom_boxplot(aes(x=feature, y = AUC))+
theme_classic()

t.test(auc_df$group, auc_df$group_shuffle)
Welch Two Sample t-test
data: auc_df$group and auc_df$group_shuffle
t = 21.57, df = 15.18, p-value = 8.37e-13
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
0.3677251 0.4482749
sample estimates:
mean of x mean of y
0.942 0.534
Importance plots
look-up table
#Making asv reference for names in model
lookup_asv = tax_table(p_ps) %>%
data.frame() %>%
lowest_level() %>%
rownames_to_column("asv") %>%
select(asv, name) %>%
mutate(name = make.names(make.unique(name)))
top 10 ranked list
# Getting the ranked list
ranking = importance_summary %>%
filter(variable == "group_shuffle") %>%
pull(food)
top10 = ranking[1:10]
Importance plot
importance_df %>%
pivot_longer(cols = c("Helianthus.annuus":"Rheum.rhaponticum"), names_to = "food", values_to = "importance") %>%
filter(variable == "group_shuffle" & food %in% top10) %>%
ggplot()+
geom_boxplot(aes(x= reorder(food,-importance), y = importance))+
labs(x= "Food", y ="Importance") +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust=1))

Some of the important NAs for predictions only map to bacterial
species in BLAST.
Important taxa directions










lookup_asv %>%
filter(name == "Theobroma.cacao") %>%
pull(asv)
[1] "ATCCTATTATTTTATTATTTTACGAAACTAAACAAAGGTTCAGCAAGCGAGAATAATAAAAAAAG"
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKIyBMaWJyYXJpZXMKCmBgYHtyLCBpbmNsdWRlID0gRkFMU0V9CmxpYnJhcnkodGlkeXZlcnNlKQpsaWJyYXJ5KG1pY3JvYmlvbWUpCmxpYnJhcnkocGh5bG9zZXEpCmxpYnJhcnkoY2FyZXQpCmxpYnJhcnkocmFuZG9tRm9yZXN0KQpsaWJyYXJ5KE1CdXRpbHMpCmxpYnJhcnkoTUxldmFsKQpsaWJyYXJ5KERNd1IpCmBgYAoKIyBMb2FkIGRhdGEKCkkgYW0gdXNpbmcgdGhlIEFIQSBkYXRhc2V0IGZvciB3cml0aW5nIG15IGNvZGUgYmVjYXVzZSBpdCBpcyB0aGUgc2ltcGxlc3QgZGF0YXNldCBJIGNhbiB3b3JrIHdpdGguClNpbXBsZXN0ID0gc2lnbmlmaWNhbnQgbnVtYmVyIG9mIGJhc2VsaW5lIHNhbXBsZXMgd2l0aG91dCBpc3N1ZSBvZiByZXBlYXRlZCBtZWFzdXJlcywgYmFsYW5jZWQgZGlzdHJpYnV0aW9uIG9mIGNsYXNzZXMsIGxpa2VseSBkaWZmZXJlbmNlIHRvIGJlIGRldGVjdGVkIHByZXNlbnQuCgoKYGBge3J9CnBfcHMgPSByZWFkUkRTKCIvVXNlcnMvYWEzNzAvTGlicmFyeS9DbG91ZFN0b3JhZ2UvQm94LUJveC9wcm9qZWN0X2RhdmlkbGFiL0xBRF9MQUJfUGVyc29ubmVsL0FtbWFyYV9BL1Byb2plY3RzL1BPTU1TL1IyNF9QT01NUy9kYXRhL3Byb2Nlc3NlZC8yMDIyMTEwNF9wb21tc19tZXRhZGF0YV9wbGFudC5yZHMiKQoKcF9wcwoKc2FtcGxlX2RhdGEocF9wcykKYGBgCgojIFByZXByb2Nlc3MKCiMjIFJlbW92ZSBwcm9ibGVtYXRpYyBzYW1wbGVzCgpUYWtpbmcgb3V0IHNhbXBsZXMgd2l0aCBsZXNzIHRoYW4gb3IgZXF1YWwgdG8gMCByZWFkcyBhbmQga2VlcGluZyBvbmx5IGVudHJ5IHRpbWVwb2ludCByZWR1Y2VzIHNhbXBsZXMgZnJvbSAzODQgZG93biB0byAyNDAuCgpgYGB7cn0KcF9wcyA9IHBfcHMgJT4lCiAgc3Vic2V0X3NhbXBsZXMoIHJlYWRzID4gMCkgJT4lCiAgc3Vic2V0X3NhbXBsZXModGltZXBvaW50ID09ICJFbnRyeSIpCgpwX3BzCmBgYAoKQ2xhc3MgZGlzdHJpYnV0aW9uIGluIHRoaXMgZGF0YXNldDoKQ2FzZSA9IDE5MwpDb250cm9sID0gNDcKCmBgYHtyfQpzYW1wbGVfZGF0YShwX3BzKSAlPiUKICBhcy5kYXRhLmZyYW1lKCkgJT4lCiAgYXNfdGliYmxlKCkgJT4lCiAgZ3JvdXBfYnkoZ3JvdXApICU+JQogIHN1bW1hcmlzZShjb3VudCA9IG4oKSkKYGBgCgojIyBEYXRhIHRyYW5zZm9ybQoKYGBge3J9Cm90dV9jbHIgPSBhYnVuZGFuY2VzKHBfcHMsICJjbHIiKSAlPiUgIyByY2xyIHRyYW5zZm9ybSBjb3VsZCBub3QgYmUgcGVyZm9ybWVkLCBnZXQgTmFucwogIHQoKQoKcF9wcyA9IHBoeWxvc2VxKG90dV90YWJsZShvdHVfY2xyLCB0YXhhX2FyZV9yb3dzID0gRkFMU0UpLAogICAgICAgICAgICAgICAgICAgc2FtcGxlX2RhdGEoc2FtcGxlX2RhdGEocF9wcykpLAogICAgICAgICAgICAgICAgICAgdGF4X3RhYmxlKHRheF90YWJsZShwX3BzKSkpCgpybShvdHVfY2xyKQpgYGAKCiMjIEZlYXR1cmUgdGFibGUKCiMjIyBFeHRyYWN0IGRhdGEKCk9yZGVyIG9mIGFzdiBjb2xuYW1lcyBpbiBPVFUgdGFibGUgYW5kIHRheCB0YWJsZSBpcyB0aGUgc2FtZS4KYGBge3J9CmZlYXR1cmVzID0gcF9wc0BvdHVfdGFibGUgJT4lCiAgYXMuZGF0YS5mcmFtZSgpICU+JQogIGFzX3RpYmJsZShyb3duYW1lcyA9IE5BKSAlPiUKICByb3duYW1lc190b19jb2x1bW4oInNhbXBsZSIpCgpmZWF0dXJlX2xhYmVscyA9IGMoInNhbXBsZSIpCgogbmFtZXMgPSB0YXhfdGFibGUocF9wcykgJT4lCiAgICAgIGRhdGEuZnJhbWUoKSAlPiUKICAgICAgbG93ZXN0X2xldmVsKCkgJT4lCiAgIHB1bGwobmFtZSkKCiBmZWF0dXJlX2xhYmVscyA9IGFwcGVuZChmZWF0dXJlX2xhYmVscywgbmFtZXMpCgogZmVhdHVyZV9sYWJlbHMgPSBtYWtlLnVuaXF1ZShmZWF0dXJlX2xhYmVscykgJT4lCiAgIG1ha2UubmFtZXMoKQoKIGNvbG5hbWVzKGZlYXR1cmVzKSA9IGZlYXR1cmVfbGFiZWxzCgpmZWF0dXJlcwpgYGAKIyMjIEF0dGFjaCBkYXRhIGxhYmVscwpgYGB7cn0Kc2FtcGxlX2xhYmVscyA9IHBfcHNAc2FtX2RhdGElPiUKICBhcy5kYXRhLmZyYW1lKCkgJT4lCiAgYXNfdGliYmxlKHJvd25hbWVzID0gTkEpICU+JQogIHJvd25hbWVzX3RvX2NvbHVtbigic2FtcGxlIikgJT4lCiAgc2VsZWN0KHNhbXBsZSwgZ3JvdXApCgpmZWF0dXJlcyA9IHNhbXBsZV9sYWJlbHMgJT4lCiAgbGVmdF9qb2luKGZlYXR1cmVzKQpgYGAKI1N1YnNhbXBsaW5nOiBTTU9URQoKYGBge3J9CnNtb3RlX2lucHV0ID0gZmVhdHVyZXMgJT4lCiAgc2VsZWN0KC1zYW1wbGUpICU+JQogIG11dGF0ZShncm91cCA9IGFzLmZhY3Rvcihncm91cCkpICU+JQogIGFzLmRhdGEuZnJhbWUoKQoKc3Vic2FtcGxlZCA9IFNNT1RFKGdyb3VwIH4gLiwgZGF0YSAgPSBzbW90ZV9pbnB1dCkKCnN1YnNhbXBsZWQlPiUKICBncm91cF9ieShncm91cCkgJT4lCiAgc3VtbWFyaXNlKGNvdW50ID0gbigpKQpgYGAKIyMjIFplcm8gdmFyaWFuY2UgZmVhdHVyZXMKCk5vIGZlYXR1cmVzIGhhdmUgemVybyB2YXJpYW5jZQpgYGB7cn0Kbnp2ID0gc3Vic2FtcGxlZCAlPiUKICBzZWxlY3QoIWdyb3VwKSAlPiUKICBuZWFyWmVyb1ZhcihzYXZlTWV0cmljcyA9IFRSVUUpCmBgYAoKYGBge3J9CnJtKG56dikKYGBgCgojIyMgQ29ycmVsYXRlZCBwcmVkaWN0b3JzCgpgYGB7cn0KZmVhdHVyZV9jb3IgPSBzdWJzYW1wbGVkICU+JQogc2VsZWN0KCFncm91cCkgJT4lCiBjb3IoKSAKCnN1bW1hcnkoZmVhdHVyZV9jb3JbdXBwZXIudHJpKGZlYXR1cmVfY29yKV0pCmBgYAoKYGBge3J9CmhpZ2hfY29yID0gZmluZENvcnJlbGF0aW9uKGZlYXR1cmVfY29yLCBjdXRvZmYgPSAuNzUpCgpmaWx0ZXJlZF9zdWJzYW1wbGVkID0gc3Vic2FtcGxlZFssLWhpZ2hfY29yXQpgYGAKCmBgYHtyfQpybShoaWdoX2NvcikKcm0oZmVhdHVyZV9jb3IpCmBgYAoKIyMjIExpbmVhciBkZXBlbmRlbmNpZXMKCmBgYHtyfQpmZWF0dXJlX2xkID0gZmlsdGVyZWRfc3Vic2FtcGxlZCAlPiUKICBzZWxlY3QoIWdyb3VwKSAlPiUKICBmaW5kTGluZWFyQ29tYm9zKCkKCmZpbHRlcmVkX3N1YnNhbXBsZWQgPSBmaWx0ZXJlZF9zdWJzYW1wbGVkWywgLSBmZWF0dXJlX2xkJHJlbW92ZV0KYGBgCgojIyBJbnB1dCB0YWJsZQoKUHJlcHJvY2Vzc2luZyByZWR1Y2VzIGNvbHVtbiBudW1iZXIgdG8gMTAyIGZyb20gMjM1CgpgYGB7cn0KaW5wdXQgPSBmaWx0ZXJlZF9zdWJzYW1wbGVkICU+JQogIG11dGF0ZShncm91cF9zaHVmZmxlID0gc2FtcGxlKGdyb3VwKSkgJT4lCiAgcm93bmFtZXNfdG9fY29sdW1uKCJzYW1wbGUiKQpgYGAKCiMgTW9kZWxpbmcKCkxvYWQgZnVuY3Rpb25zIGZyb20gY2xhc3NpZmljYXRpb25fZnVuY3Rpb25fQUEgZmlsZQoKYGBge3J9CmxhYmVsX2xpc3QgPSBjKCJncm91cCIsICJncm91cF9zaHVmZmxlIikKcmVzdWx0cyA9IGxvb3BfbGFiZWwoaXRlcmF0aW9ucyA9IDEwLCBpbnB1dCwgbGFiZWxfbGlzdCwgcGVyY2VudCA9IDAuNykKCmF1Y19kZiA9IHJlc3VsdHNbWzJdXQoKaW1wb3J0YW5jZV9kZiA9IHJlc3VsdHNbWzFdXQpgYGAKCmBgYHtyfQpybShsYWJlbF9saXN0KQpgYGAKCiMgUGxvdHRpbmcKCiMjIEFVQyBwbG90IGFuZCB0LXRlc3QKYGBge3J9CmF1Y19kZiAlPiUKICBwaXZvdF9sb25nZXIoY29scyA9IGMoImdyb3VwIiwgImdyb3VwX3NodWZmbGUiKSwgbmFtZXNfdG8gPSAiZmVhdHVyZSIsIHZhbHVlc190byA9ICJBVUMiKSAlPiUKICBnZ3Bsb3QoKSsKICBnZW9tX2JveHBsb3QoYWVzKHg9ZmVhdHVyZSwgeSA9IEFVQykpKwogIHRoZW1lX2NsYXNzaWMoKQoKdC50ZXN0KGF1Y19kZiRncm91cCwgYXVjX2RmJGdyb3VwX3NodWZmbGUpCgpgYGAKCiMjIEltcG9ydGFuY2UgcGxvdHMKCiMjIyBsb29rLXVwIHRhYmxlCmBgYHtyfQojTWFraW5nIGFzdiByZWZlcmVuY2UgZm9yIG5hbWVzIGluIG1vZGVsCmxvb2t1cF9hc3YgPSB0YXhfdGFibGUocF9wcykgJT4lIAogICAgIGRhdGEuZnJhbWUoKSAlPiUKICAgICAgbG93ZXN0X2xldmVsKCkgJT4lCiAgIHJvd25hbWVzX3RvX2NvbHVtbigiYXN2IikgJT4lIAogICBzZWxlY3QoYXN2LCBuYW1lKSAlPiUKICAgbXV0YXRlKG5hbWUgPSBtYWtlLm5hbWVzKG1ha2UudW5pcXVlKG5hbWUpKSkKYGBgCgojIyMgc3VtbWFyaXplIGltcG9ydGFuY2VzCmBgYHtyfQojIE1ha2luZyBzdW1tYXJ5IGRmIG9mIGltcG9ydGFuY2VzCmltcG9ydGFuY2Vfc3VtbWFyeSA9IGltcG9ydGFuY2VfZGYgJT4lCiAgcGl2b3RfbG9uZ2VyKGNvbHMgPSBjKCJIZWxpYW50aHVzLmFubnV1cyI6IlJoZXVtLnJoYXBvbnRpY3VtIiksIG5hbWVzX3RvID0gImZvb2QiLCB2YWx1ZXNfdG8gPSAiaW1wb3J0YW5jZSIpICU+JQogIGdyb3VwX2J5KHZhcmlhYmxlLCBmb29kKSAlPiUKICBzdW1tYXJpc2UobWVhbl9pbXBvcnRhbmNlID0gbWVhbihpbXBvcnRhbmNlKSkgJT4lCiAgYXJyYW5nZSh2YXJpYWJsZSwgZGVzYyhtZWFuX2ltcG9ydGFuY2UpKQpgYGAKIyMjIHRvcCAxMCByYW5rZWQgbGlzdApgYGB7cn0KIyBHZXR0aW5nIHRoZSByYW5rZWQgbGlzdApyYW5raW5nID0gaW1wb3J0YW5jZV9zdW1tYXJ5ICU+JQogIGZpbHRlcih2YXJpYWJsZSA9PSAiZ3JvdXBfc2h1ZmZsZSIpICU+JQogIHB1bGwoZm9vZCkKCnRvcDEwID0gcmFua2luZ1sxOjEwXQpgYGAKCiMjIyBJbXBvcnRhbmNlIHBsb3QKYGBge3J9CmltcG9ydGFuY2VfZGYgJT4lCiAgcGl2b3RfbG9uZ2VyKGNvbHMgPSBjKCJIZWxpYW50aHVzLmFubnV1cyI6IlJoZXVtLnJoYXBvbnRpY3VtIiksIG5hbWVzX3RvID0gImZvb2QiLCB2YWx1ZXNfdG8gPSAiaW1wb3J0YW5jZSIpICAlPiUKICBmaWx0ZXIodmFyaWFibGUgPT0gImdyb3VwX3NodWZmbGUiICYgZm9vZCAlaW4lIHRvcDEwKSAlPiUKICBnZ3Bsb3QoKSsKICBnZW9tX2JveHBsb3QoYWVzKHg9IHJlb3JkZXIoZm9vZCwtaW1wb3J0YW5jZSksIHkgPSBpbXBvcnRhbmNlKSkrCiAgbGFicyh4PSAiRm9vZCIsIHkgPSJJbXBvcnRhbmNlIikgKwogIHRoZW1lX2NsYXNzaWMoKSArCiAgdGhlbWUoYXhpcy50ZXh0LnggPSBlbGVtZW50X3RleHQoYW5nbGUgPSA0NSwgaGp1c3Q9MSkpCmBgYAoKU29tZSBvZiB0aGUgaW1wb3J0YW50IE5BcyBmb3IgcHJlZGljdGlvbnMgb25seSBtYXAgdG8gYmFjdGVyaWFsIHNwZWNpZXMgaW4gQkxBU1QuCgojIEltcG9ydGFudCB0YXhhIGRpcmVjdGlvbnMKCmBgYHtyfQpzYW1wbGVfZGF0YShwX3BzKSAlPiUKICBkYXRhLmZyYW1lKCkgJT4lCiAgZmlsdGVyKHRpbWVwb2ludCA9PSAiRW50cnkiKSAlPiUKICBnZ3Bsb3QoKSArCiAgZ2VvbV9ib3hwbG90KGFlcyh4PSBncm91cCwgeSA9IGJtaSkpKwogIHRoZW1lX2NsYXNzaWMoKQoKcGxvdF9mb29kX2RpcmVjdGlvbih0b3AxMCwgZmVhdHVyZXMpCmBgYAoKCmBgYHtyfQpsb29rdXBfYXN2ICU+JQogIGZpbHRlcihuYW1lID09ICJUaGVvYnJvbWEuY2FjYW8iKSAlPiUKICBwdWxsKGFzdikKYGBgCgoK